library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
# Load the books data from the project-relative CSV file.
# NOTE(review): readr reports parsing failures for this file (rows whose
# fields do not line up with the 12 expected columns); inspect them with
# problems(books) before trusting the affected rows.
books <- readr::read_csv(file = "data/books.csv")
## 
## ── Column specification ──────────────────────────────────────────────────────────────────────────────
## cols(
##   bookID = col_double(),
##   title = col_character(),
##   authors = col_character(),
##   average_rating = col_double(),
##   isbn = col_character(),
##   isbn13 = col_character(),
##   language_code = col_character(),
##   num_pages = col_double(),
##   ratings_count = col_double(),
##   text_reviews_count = col_double(),
##   publication_date = col_character(),
##   publisher = col_character()
## )
## Warning: 21 parsing failures.
##  row            col           expected            actual             file
## 1570 title          delimiter or quote                   'data/books.csv'
## 1570 title          delimiter or quote I                 'data/books.csv'
## 3349 average_rating a double           Jr./Sam B. Warner 'data/books.csv'
## 3349 num_pages      a double           en-US             'data/books.csv'
## 3349 NA             12 columns         13 columns        'data/books.csv'
## .... .............. .................. ................. ................
## See problems(...) for more details.
# Number of rows and columns in the loaded data.
dim(books)
## [1] 8472   12
# Compact structure of the data: each column's type and first values, plus the
# `problems` and `spec` attributes that readr attached while parsing.
str(books)
## tibble [8,472 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ bookID            : num [1:8472] 1 2 4 5 8 9 10 12 13 14 ...
##  $ title             : chr [1:8472] "Harry Potter and the Half-Blood Prince (Harry Potter  #6)" "Harry Potter and the Order of the Phoenix (Harry Potter  #5)" "Harry Potter and the Chamber of Secrets (Harry Potter  #2)" "Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)" ...
##  $ authors           : chr [1:8472] "J.K. Rowling/Mary GrandPré" "J.K. Rowling/Mary GrandPré" "J.K. Rowling" "J.K. Rowling/Mary GrandPré" ...
##  $ average_rating    : num [1:8472] 4.57 4.49 4.42 4.56 4.78 3.74 4.73 4.38 4.38 4.22 ...
##  $ isbn              : chr [1:8472] "0439785960" "0439358078" "0439554896" "043965548X" ...
##  $ isbn13            : chr [1:8472] "9780439785969" "9780439358071" "9780439554893" "9780439655484" ...
##  $ language_code     : chr [1:8472] "eng" "eng" "eng" "eng" ...
##  $ num_pages         : num [1:8472] 652 870 352 435 2690 ...
##  $ ratings_count     : num [1:8472] 2095690 2153167 6333 2339585 41428 ...
##  $ text_reviews_count: num [1:8472] 27591 29221 244 36325 164 ...
##  $ publication_date  : chr [1:8472] "9/16/2006" "9/1/2004" "11/1/2003" "5/1/2004" ...
##  $ publisher         : chr [1:8472] "Scholastic Inc." "Scholastic Inc." "Scholastic" "Scholastic Inc." ...
##  - attr(*, "problems")= tibble [21 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:21] 1570 1570 3349 3349 3349 4513 4513 4513 4513 4513 ...
##   ..$ col     : chr [1:21] "title" "title" "average_rating" "num_pages" ...
##   ..$ expected: chr [1:21] "delimiter or quote" "delimiter or quote" "a double" "a double" ...
##   ..$ actual  : chr [1:21] " " "I" "Jr./Sam B. Warner" "en-US" ...
##   ..$ file    : chr [1:21] "'data/books.csv'" "'data/books.csv'" "'data/books.csv'" "'data/books.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   bookID = col_double(),
##   ..   title = col_character(),
##   ..   authors = col_character(),
##   ..   average_rating = col_double(),
##   ..   isbn = col_character(),
##   ..   isbn13 = col_character(),
##   ..   language_code = col_character(),
##   ..   num_pages = col_double(),
##   ..   ratings_count = col_double(),
##   ..   text_reviews_count = col_double(),
##   ..   publication_date = col_character(),
##   ..   publisher = col_character()
##   .. )
# Column names available for the selections and filters that follow.
names(books)
##  [1] "bookID"             "title"              "authors"           
##  [4] "average_rating"     "isbn"               "isbn13"            
##  [7] "language_code"      "num_pages"          "ratings_count"     
## [10] "text_reviews_count" "publication_date"   "publisher"
# Print the tibble for a quick visual check of the data.
books

Searching for NA values

# Rows where the `authors` field is missing (printed for inspection only).
filter(books, is.na(authors))
# Rows where `average_rating` is missing (printed for inspection only).
filter(books, is.na(average_rating))

Renaming columns

# Give the ID column a snake_case name so it matches the other column names.
rename_books_col <- rename(books, book_id = bookID)

Removing rows that contain NA values.

# Keep only rows that have no missing value in any column from `book_id`
# through `publisher`.
# `drop_na()` replaces the superseded `filter_at(vars(...), all_vars(...))`
# form and expresses the intent directly; it comes from tidyr, which is
# attached by library(tidyverse).
books_no_na <- rename_books_col %>%
  drop_na(book_id:publisher)

Top 10 highest-rated books with at least 100 ratings (`ratings_count`).

# Top 10 highest-rated books among those with at least 100 ratings.
# Every step must end in `%>%`: without a pipe after `filter()`, `head(10)`
# is evaluated as a separate statement (it just returns the number 10) and
# the result is never limited to ten rows.
top_rated_books <- books_no_na %>%
  select(title, authors, average_rating, ratings_count) %>%
  filter(ratings_count >= 100) %>%
  arrange(desc(average_rating)) %>%
  head(10)
top_rated_books

Books with at least 2000 pages

# Books with 2000 or more pages, longest first.
# Only the title and the page count are kept in the result.
over_two_thousand <- books_no_na %>%
  filter(num_pages >= 2000) %>%
  arrange(desc(num_pages)) %>%
  select(title, num_pages)

over_two_thousand

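Number of books for each `authors` entry (a co-author combination such as "J.K. Rowling/Mary GrandPré" is counted as its own entry).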

# One row per distinct `authors` value, with the number of rows (books) for
# that value in column `n`.
book_count <- count(books_no_na, authors)
book_count

Average number of pages per publisher

# Mean page count per publisher: one output row for each distinct `publisher`.
avg_page_pub <- summarise(
  group_by(books_no_na, publisher),
  avg_num_pages = mean(num_pages)
)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_page_pub